suppressPackageStartupMessages({
library(ggplot2)
library(plotly)
library(data.table)
})
set.seed(1614)
Linear Regression
Least Squares
# Simulate a simple linear relationship: y = 2x + standard-normal noise.
n <- 50
d <- data.frame(x = rnorm(n))
d$y <- d$x * 2 + rnorm(n)
# Fit by ordinary least squares and keep the fitted values for plotting.
mod <- lm(y ~ x, data = d)
d$fitted <- mod$fitted.values
# Scatter plot with the OLS line; the red segments are the residuals,
# i.e. the vertical distances whose squared sum least squares minimizes.
ggplot(d, aes(x = x, y = y)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_segment(aes(xend = x, yend = fitted),
color = "red") +
labs(title = "Least Squares",
subtitle = "Minimizing the sum of squared errors") +
theme(
plot.subtitle = element_text(colour = "red")
)`geom_smooth()` using formula = 'y ~ x'
From Scratch - Simple Regression
\(Y = a + b X + \varepsilon\)
# acquire data
Y <- as.matrix(iris[, "Petal.Length"])# col matrix
X <- as.matrix(iris[, c("Sepal.Length")]) # col matrix
# this is the correct answer, as computed by R
(m0 <- lm(Y ~ X))
Call:
lm(formula = Y ~ X)
Coefficients:
(Intercept) X
-7.101 1.858
# let's estimate b (the slope) and a (the intercept) by hand
## one way to think of this - b = cov(X, Y) / var(X)
## and then - a = E[Y] - b E[X]
## but we have to use sample estimates, so replace expectations with averages
(b_hat <- cov(X, Y) / var(X)) [,1]
[1,] 1.858433
## or, do it by hand
## sample cov / sample var written out with means:
## b = (mean(XY) - mean(X) mean(Y)) / (mean(X^2) - mean(X)^2)
(b_hat <- (mean(X * Y) - mean(X)*mean(Y)) /
(mean(X^2) - mean(X)^2) )[1] 1.858433
## leaving
(a_hat <- mean(Y) - b_hat * mean(X))[1] -7.101443
# plot the data with the hand-computed regression line overlaid
data.frame(x = X, y = Y, yhat = a_hat + b_hat * X) |>
ggplot() +
geom_point(aes(x = x, y = y)) +
geom_line(aes(x = x, y = yhat), color = "blue") +
theme_minimal() +
labs(title = "Simple Linear Regression")Let’s try multiple regression.
Matrix form:
\(Y = X \beta + \varepsilon\)
\(\hat{\beta} = (X^\top X)^{-1}X^\top Y\) (if \(X^\top X\) is invertible)
# let's switch to multiple regression, and use matrix notation for convenience
Y <- as.matrix(iris[, "Petal.Length"])# col matrix
X <- as.matrix(iris[, c("Petal.Width", "Sepal.Length")])
# this is the right answer, from R
(m0 <- lm(Y ~ X))
Call:
lm(formula = Y ~ X)
Coefficients:
(Intercept) XPetal.Width XSepal.Length
-1.5071 1.7481 0.5423
# now, by hand
## add constant vector to create the model/design matrix
##> model.matrix(~ X), or do it by hand
Xm <- cbind(1, X)
head(Xm) Petal.Width Sepal.Length
[1,] 1 0.2 5.1
[2,] 1 0.2 4.9
[3,] 1 0.2 4.7
[4,] 1 0.2 4.6
[5,] 1 0.2 5.0
[6,] 1 0.4 5.4
## calculate beta vector - note, solve(A) computes the inverse of A
## this is the normal-equations solution: beta_hat = (Xm^T Xm)^{-1} Xm^T Y
(betas_hat <- solve(t(Xm) %*% Xm) %*% t(Xm) %*% Y) [,1]
-1.5071384
Petal.Width 1.7481029
Sepal.Length 0.5422556
## now we can predict Y
Y_hat <- Xm %*% betas_hat
## sanity check: matches R's fitted values
all.equal(c(Y_hat), m0$fitted.values, check.attributes = FALSE)[1] TRUE
## by the way, we can calculate the projection matrix which projects Y onto the
## column span of X - another way to produce our estimate, \hat{Y}
P <- Xm %*% solve(t(Xm) %*% Xm) %*% t(Xm)
dim(P) #n x n[1] 150 150
Y_hat2 <- P %*% Y
## all three routes to the fitted values agree
head(data.frame(lm = m0$fitted.values,
Xb = Y_hat,
Py = Y_hat2)) lm Xb Py
1 1.607986 1.607986 1.607986
2 1.499535 1.499535 1.499535
3 1.391084 1.391084 1.391084
4 1.336858 1.336858 1.336858
5 1.553760 1.553760 1.553760
6 2.120283 2.120283 2.120283
# with two explanatory variables, we are finding the 'plane of best fit'
## evaluation grids spanning each predictor's observed range
x1 <- seq(min(X[, 1]), max(X[, 1]), length = 300)
x2 <- seq(min(X[, 2]), max(X[, 2]), length = length(x1))
## fitted plane over the full grid: z[i, j] = b0 + b1 * x1[j] + b2 * x2[i]
## (plotly's add_surface expects rows of z to run along the y grid).
## NOTE: the previous version recycled a single vector of "diagonal"
## predictions into a matrix of ones, producing a surface that varies along
## only one axis -- not the fitted plane. An outer sum over both grids is
## what's needed.
ymat <- betas_hat[1] +
  outer(betas_hat[3] * x2, betas_hat[2] * x1, `+`)
d <- data.frame(x = X[, 1], y = X[, 2], z = Y)
# 3-D scatter of the observations with the fitted surface overlaid
plot_ly(d, x = ~x, y = ~y, z = ~z) |>
add_markers() |>
add_surface(x = ~x1, y = ~x2, z = ~ymat)Regression with Inference
Model: \(Y_i = X_i^\top \beta + \varepsilon_i\)
Stacked into matrix form: \(Y = X \beta + \varepsilon\)
Let’s assume \(X\) is deterministic.
We’ll make our Gauss-Markov assumptions:
- \(rank(X) = p\)
- homoskedastic noise term - so the variance of \(\varepsilon\) is the same for all \(i\)
- \(\varepsilon_i \sim N(0, \sigma^2)\), for some unknown variance (noise is Gaussian with mean 0)
Which allows us to perform some statistical inference and extend the least squares algorithm.
We say \(Y \sim N_n(X \beta, \sigma^2 I_n)\) (if \(X\) was random, we would write \(Y|X\)).
set.seed(1614)
# sim some data
n <- 10000
# design matrix
# - we treat as deterministic, but first have to "observe" some data
X <- cbind(1, rnorm(n))
# outcome Y
beta <- c(0, 5)
s2 <- 4 # sigma^2, variance, common to all individuals i
# draw Y ~ N(X beta, sigma^2 I): each Y_i is Gaussian around its own mean
Y <- rnorm(n,
mean = X %*% beta,
sd = sqrt(s2))
# now we observe X and Y, but do not know beta or sigma^2
ggplot() +
geom_point(aes(x = X[, 2], y = Y), alpha = 0.10)# beta - right answer, courtesy of R
m0 <- lm(Y ~ X[,2])
summary(m0)$coef[, "Estimate"](Intercept) X[, 2]
0.004788303 4.987385782
# by hand:
# OLS estimator, beta_hat = (X^T X)^{-1} X^T Y
as.vector(beta_hat <- solve(t(X) %*% X) %*% t(X) %*% Y)[1] 0.004788303 4.987385782
Y_hat <- X %*% beta_hat
# by the way, we can compute an unbiased estimator of sigma^2 (which is == 4)
# as prediction-error / (n - p)
# where prediction-error is the sum of squared residuals, n is sample size, p is
# the dimension of the parameter vector.
p <- ncol(X)
(s2_hat <- sum((Y - Y_hat)^2) / (n - p))[1] 4.003794
summary(m0)$sigma^2[1] 4.003794
# and yes, this is ~ equal to the empirical variance of the residuals
var(Y - Y_hat) [,1]
[1,] 4.003394
# we can also check the following relationship:
# E[ ||Y - X * beta ||^2 ] = sigma^2 * (n - p)
# prediction-error = variance * (n - p) in expectation
# (which is why the variance estimator is prediction-error/(n - p))
(sum((Y - Y_hat)^2))[1] 40029.94
(s2 * (n - p))[1] 39992
# Standard Errors:
## compute covariance matrix:
## i.e., empirical variance * inverse of (X^T X)
## Var(beta_hat) = sigma^2 (X^T X)^{-1}, with sigma^2 replaced by its estimate
(covmat <- s2_hat * solve(t(X) %*% X)) [,1] [,2]
[1,] 4.003808e-04 -7.448360e-07
[2,] -7.448360e-07 3.959545e-04
unname(vcov(m0)) [,1] [,2]
[1,] 4.003808e-04 -7.448360e-07
[2,] -7.448360e-07 3.959545e-04
## compute SEs: the sqrt of the diagonal (variances) of the covariance matrix
(se <- sqrt(diag(covmat)))[1] 0.02000952 0.01989860
unname(summary(m0)$coef[, "Std. Error"])[1] 0.02000952 0.01989860
## compute T test statistic
## H0: beta_j = 0, H1: beta_j != 0
## each T_n,j = beta_hat_j / se_j follows a t(n - p) distribution under H0
as.vector(Tn <- beta_hat / se)
unname(summary(m0)$coef[, "t value"])
## and p-vals -- the test is two-sided, so the tail probability must use |T|.
## (The previous version used Tn directly, which overstates the p-value --
## it can even exceed 1 -- whenever a t statistic is negative.)
as.vector(2 * (1 - pt(abs(Tn), df = n - p)))
unname(summary(m0)$coef[, "Pr(>|t|)"])
simY <- function(n) {
  ## Simulate the outcome vector 1000 times from the fitted model,
  ## N(X beta_hat, s2_hat I) -- rnorm keeps only the first n entries of the
  ## supplied mean vector, so this effectively uses the first n rows of X --
  ## and return the per-observation average across simulations (length n).
  sims <- vapply(
    seq_len(1000),
    function(i) rnorm(n, mean = X %*% beta_hat, sd = sqrt(s2_hat)),
    numeric(n)
  )
  rowMeans(sims)
}
# compare densities of simulated Y (grey/blue/cyan, increasing n) against the
# observed Y (black): larger simulations track the data more closely
ggplot() +
geom_density(aes(x = simY(50)), color = "grey") +
geom_density(aes(x = simY(100)), color = "lightblue") +
geom_density(aes(x = simY(1000)), color = "cyan") +
geom_density(aes(x = Y)) +
labs(x="Y", title="Simulations of Y using estimated parameters")More importantly we can do inference on the parameters of the model.
We can use a t-test to test if a given \(\beta_j\) is significantly different from 0:
# R's summary runs some tests which we can compare our manual work to
summary(m0)
Call:
lm(formula = Y ~ X[, 2])
Residuals:
Min 1Q Median 3Q Max
-7.862 -1.354 -0.013 1.367 9.320
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.004788 0.020010 0.239 0.811
X[, 2] 4.987386 0.019899 250.640 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.001 on 9998 degrees of freedom
Multiple R-squared: 0.8627, Adjusted R-squared: 0.8627
F-statistic: 6.282e+04 on 1 and 9998 DF, p-value: < 2.2e-16
#H0: beta_j = 0
#H1: beta_j != 0
j <- 2 # test the second coordinate of beta, not the intercept
# test stat
## empirical covariance matrix of beta
## (same computation as covmat above: s2_hat * (X^T X)^{-1})
(gamma <- s2_hat * solve(t(X) %*% X)) [,1] [,2]
[1,] 4.003808e-04 -7.448360e-07
[2,] -7.448360e-07 3.959545e-04
## this is the standard error of beta_j
## it is the sqrt of the jth diagonal (variance) of the covariance matrix
(se <- sqrt(gamma[j,j]))[1] 0.0198986
## the test stat follows Student's T distribution with n-p d.o.f.
(T_n <- (beta_hat[j] - 0) / se)[1] 250.64
## test at the 5% level
## two-sided test: compare |T_n| against the upper alpha/2 quantile
(q_alpha <- qt(1 - 0.05/2, df = n - p))[1] 1.960201
(reject_null <- abs(T_n) > q_alpha)[1] TRUE
## calculate p-value
## (less than .Machine$double.eps, prob. of more extreme T_n is very low)
(pval <- 2 * (1 - pt(abs(T_n), df = n - p)))[1] 0
In the lecture we learned that Student’s T test for regression can be expressed as a specific case of a more general testing paradigm, where one tests the null \(G \hat{\beta} = \lambda\) vs. \(G \hat{\beta} \ne \lambda\), for some matrix \(G\) and some vector \(\lambda\).
A popular regression test is the “regression F test”, which is meant to test jointly whether any of the \(\beta_j\) are significantly different from 0. This can be computed using the above framework if we let \(G\) be the \(k \times p\) matrix that selects the \(k\) coefficients under test, and \(\lambda = \vec{0}\).
# General F test for linear regression
# (of form, G %*% beta = lambda, see end of regression notes):
#
# sim some new data
set.seed(1614)
n <- 10000
## design matrix: intercept plus two independent standard-normal predictors
X <- cbind(1, rnorm(n), rnorm(n))
p <- ncol(X)
## outcome Y
beta <- c(0, 5, 10)
s2 <- 4 # sigma^2, variance, common to all individuals i
Y <- rnorm(n,
mean = X %*% beta,
sd = sqrt(s2))
## estimation
## OLS fit and unbiased variance estimate, exactly as in the 1-predictor case
(beta_hat <- solve(t(X) %*% X) %*% t(X) %*% Y) [,1]
[1,] -0.00604906
[2,] 5.00926400
[3,] 10.00045826
Y_hat <- X %*% beta_hat
(s2_hat <- sum((Y - Y_hat)^2) / (n - p))[1] 3.94247
summary(lm(Y ~ X[, 2:p]))
Call:
lm(formula = Y ~ X[, 2:p])
Residuals:
Min 1Q Median 3Q Max
-7.7287 -1.3426 0.0065 1.3532 7.6751
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.006049 0.019856 -0.305 0.761
X[, 2:p]1 5.009264 0.019746 253.685 <2e-16 ***
X[, 2:p]2 10.000458 0.019848 503.846 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 1.986 on 9997 degrees of freedom
Multiple R-squared: 0.9694, Adjusted R-squared: 0.9694
F-statistic: 1.583e+05 on 2 and 9997 DF, p-value: < 2.2e-16
# H0: beta_2 = beta_3 = 0   (the slope coefficients are jointly zero)
# H1: at least one beta_j != 0
## G selects the k coefficients under test (dim k x p). The null above tests
## the two slopes, NOT the intercept, so k = p - 1 and each row of G picks
## out one slope coordinate. (The previous version used the p x p identity,
## which also tests the intercept and contradicts the stated H0 -- that was
## why the d.o.f. needed a fudge from k to k - 1 to match R.)
k <- p - 1
G <- cbind(0, diag(k))
lambda <- 0 # recycled to the zero vector of length k
# Compute test statistic
# which follows the F distribution with k (number of restrictions tested)
# and (n-p) d.o.f.
## Note the bracketing: we need [G (X^T X)^{-1} G^T]^{-1}; the earlier
## version computed solve(G %*% solve(t(X) %*% X %*% t(G))), which only
## gave the right answer by accident because G was the identity.
T_n <- (1 / s2_hat) *
t(G %*% beta_hat - lambda) %*%
solve(G %*% solve(t(X) %*% X) %*% t(G)) %*%
(G %*% beta_hat - lambda)
## scale by the number of restrictions; with G selecting the slopes this
## reproduces R's F-statistic (~1.583e+05) on k and n - p d.o.f.
(T_n <- T_n / k)
# test at 5% level -- the F test is one-sided (reject only for large T_n),
# so the critical value uses 1 - 0.05, not 1 - 0.05/2
(q_alpha <- qf(1 - 0.05, df1 = k, df2 = n - p))
(reject_null <- T_n > q_alpha)
# p-value: upper tail of the F distribution (one-sided, no doubling)
(pval <- 1 - pf(T_n, df1 = k, df2 = n - p))